start_stations <-
merge(trips_data, stations_data, by.y = "station_id", by.x = "start_rental_zone_hal_id") %>% select(name, lat, lon)
colnames(start_stations) <-
c("name_start_station",
"lat_start_station",
"lon_start_station")
end_stations <-
merge(trips_data, stations_data, by.y = "station_id", by.x = "end_rental_zone_hal_id") %>% select(name, lat, lon)
colnames(end_stations) <-
c("name_end_station", "lat_end_station", "lon_end_station")
trips_data_latlon <- cbind(trips_data, start_stations, end_stations)require(geosphere)
# https://cran.r-project.org/web/packages/geosphere/vignettes/geosphere.pdf
get_time_of_day <- function(character_hhmm){
x <- as.numeric(str_replace_all(character_hhmm, ":", ""))
time_of_day <- ifelse(700 <= x & x < 1100, "Morgen",
ifelse(1100 <= x & x < 1300, "Vormittag",
ifelse(1300 <= x & x < 1500, "Mittag",
ifelse(1500 <= x & x < 1800, "Nachmittag",
ifelse(1800 <= x & x < 2200, "Abend", "Nacht")
)
)
)
)
return(factor(time_of_day, levels = c("Morgen", "Vormittag", "Mittag", "Nachmittag", "Abend", "Nacht")))
}
get_dist <- function(lon_start, lat_start, lon_end, lat_end) {
p_start = cbind(lon_start, lat_start)
p_end = cbind(lon_end, lat_end)
return(distGeo(p_start, p_end))
}
trips_data_fts <- trips_data_latlon %>%
mutate(roundtrip = ifelse(start_rental_zone_hal_id == end_rental_zone_hal_id, TRUE, FALSE),
timediff_hours = difftime(datetime_to, datetime_from, units = "hours"),
timediff_mins = difftime(datetime_to, datetime_from, units = "mins"),
time_of_day_start = get_time_of_day(hourmin_from),
time_of_day_end = get_time_of_day(hourmin_to),
distance_meters = get_dist(lon_start_station, lat_start_station,
lon_end_station, lat_end_station)) %>%
mutate(speed = ifelse(distance_meters==0, NA, as.numeric(timediff_mins)/distance_meters))# count of start station
start_trip_counts <- trips_data_fts %>% group_by(start_rental_zone_hal_id) %>%
summarise(n = n()) %>%
mutate(freq = n / sum(n))
start_trip_counts# count of end station
end_trip_counts <- trips_data_fts %>% group_by(end_rental_zone_hal_id) %>%
summarise(n = n()) %>%
mutate(freq = n / sum(n))
end_trip_counts# count of roundtrips
trips_data_fts %>%
group_by(roundtrip) %>%
summarise(n = n()) %>%
mutate(freq = n / sum(n))# histogram on trip length
mw_timediff_mins <- signif(mean(as.numeric(trips_data_fts$timediff_mins)), digits = 6)
brks <- c(3,10,30,60,90,120,150,300)
trips_data_fts %>%
ggplot(aes(as.numeric(timediff_mins))) +
geom_histogram(bins = 200) +
geom_vline(xintercept = 30, col="black", lty="dashed") +
annotate("text", x = 30-1.5, y = 450, col="black", label="First 30 mins are free of charge", angle = 90) +
geom_vline(xintercept = mw_timediff_mins, col="red") +
annotate("text", x = mw_timediff_mins-1.5, y = 350, col="red", label="Mean TimeDiff per Trip", angle = 90) +
labs(title = paste0("Majority of trips are free of charge and mean trip duration is ", mw_timediff_mins, dig=2)) +
scale_x_log10(name ="log trip duration [mins]", labels = as.character(brks), breaks = brks) +
scale_y_continuous(n.breaks = 10)# proportion of trips with duration smaller than mean trip duration
trips_data_fts %>%
mutate(smaller_mw = timediff_mins <= mw_timediff_mins) %>%
group_by(smaller_mw) %>%
summarize(n = n()) %>%
mutate(freq = n / sum(n))# proportion of trips with duration smaller 30 mins
trips_data_fts %>%
mutate(smaller_30 = as.numeric(timediff_mins) <= 30) %>%
group_by(smaller_30) %>%
summarize(n = n()) %>%
mutate(freq = n / sum(n))# count of trip-starts as per day and per time of day
starts_by_date <- trips_data_fts %>%
group_by(date_from, time_of_day_start) %>%
summarise(n = n()) %>%
ungroup() %>%
group_by(date_from) %>%
mutate(freq = n/sum(n))
starts_by_datestarts_by_date %>%
ggplot(aes(x = as.POSIXct(date_from), y = n, group = time_of_day_start, col = time_of_day_start)) +
geom_line() +
scale_x_datetime(name = "Start Date", date_breaks = "day", date_labels = "%Y-%m-%d (%a)") +
theme(axis.text.x = element_text(angle = 30, hjust = 1)) +
scale_y_continuous(breaks = seq(0,3500, by = 500))duration_by_date <- trips_data_fts %>%
group_by(date_from, time_of_day_start) %>%
summarise(mean_duration = as.numeric(mean(timediff_mins)),
sd_duration = as.numeric(sd(timediff_mins))) %>%
ungroup()
duration_by_dateduration_by_date %>%
ggplot(aes(x = as.POSIXct(date_from), y = mean_duration, group = time_of_day_start, col = time_of_day_start)) +
geom_errorbar(aes(ymin=mean_duration-sd_duration, ymax=mean_duration+sd_duration), width=.1) +
geom_line() +
geom_point() +
scale_x_datetime(name = "Start Date", date_breaks = "day", date_labels = "%Y-%m-%d (%a)") +
theme(axis.text.x = element_text(angle = 30, hjust = 1)) +
labs(title = "mean duration of trips as per start date and time of day") Observations on trips and their duration: * majority of trips end at different stations as they started (94.6%) * majority of trips are free of charge (86.6%) * mean trip duration is ~ 20 minutes
Observations on start date of trips: * Rentals at night increase especially at the weekend * Vice versa rentals in the morning decrease especially at the weekend * Most trips are started in the afternoon or evening
start_trip_counts_latlon <- merge(start_trip_counts, stations_data, by.x = "start_rental_zone_hal_id", by.y = "station_id") %>%
mutate(rental_zone_id = start_rental_zone_hal_id, group = "Departure") %>%
select(-start_rental_zone_hal_id)
end_trip_counts_latlon <- merge(end_trip_counts, stations_data, by.x = "end_rental_zone_hal_id", by.y = "station_id") %>%
mutate(rental_zone_id = end_rental_zone_hal_id, group = "Arrival") %>%
select(-end_rental_zone_hal_id)
df_station_counts <- rbind(start_trip_counts_latlon, end_trip_counts_latlon)
# color mapping
# pie(rep(1:12), col=rainbow(12))
cols <- c(1,9)
factpal <- colorFactor(rainbow(12)[cols], df_station_counts$group)
m <- leaflet(df_station_counts) %>%
addTiles() %>%
addCircles(lng = ~lon, lat = ~lat, radius = ~sqrt(n)*10,
group = ~group, color = ~factpal(group),
popup = ~name, weight = 1) %>%
addLayersControl(
baseGroups = c("Departure", "Arrival"),
options = layersControlOptions(collapsed = TRUE))
mtrips_df <- trips_data_fts %>%
group_by(start_rental_zone_hal_id, end_rental_zone_hal_id) %>%
summarise(n_trips = n()
, n_dist_customer = n_distinct(customer_hal_id)
, mw_duration = mean(timediff_mins)
# , mean_speed = timediff_mins / (distance_meters / 1000)
) %>%
ungroup() %>%
mutate(freq_trips = n_trips / sum(n_trips),
freq_dist_customer = n_dist_customer / sum(n_dist_customer),
trips_per_customer = n_trips / n_dist_customer)
trips_lonlat <- merge(trips_df, stations_data,
by.x = "start_rental_zone_hal_id", by.y = "station_id") %>%
rename(start_name = name, start_lat = lat, start_lon = lon) %>%
merge(x = ., stations_data,
by.x = "end_rental_zone_hal_id", by.y = "station_id") %>%
rename(end_name = name, end_lat = lat, end_lon = lon)
trips_lonlat$route_id <- 1:nrow(trips_lonlat)
trips_lonlatlb <- 10
trips_lonlat_vis <- trips_lonlat %>% arrange(desc(n_trips)) %>% filter(n_trips >= lb)
m1 <- leaflet(trips_lonlat_vis) %>%
addTiles()
for(i in 1:nrow(trips_lonlat_vis)){
popup_txt = paste("Station (Departure):", trips_lonlat_vis[i,"start_name"], "<br>",
"Station (Arrival):", trips_lonlat_vis[i,"end_name"], "<br>",
"Number of Trips:", trips_lonlat_vis[i,"n_trips"], "<br>",
"Freq of Trips:", trips_lonlat_vis[i,"freq_trips"], "<br>",
"Number of customers:", trips_lonlat_vis[i,"n_dist_customer"], "<br>",
"Freq of Customers", trips_lonlat_vis[i,"freq_dist_customer"], "<br>",
"Mean Duration:", trips_lonlat_vis[i,"mw_duration"], "<br>",
"Trips per Customer:", trips_lonlat_vis[i,"trips_per_customer"], "<br>")
m1 <- m1 %>%
addPolylines(data = trips_lonlat_vis[i,],
lng = ~ c(start_lon, end_lon),
lat = ~ c(start_lat, end_lat),
popup = popup_txt,
weight = ~sqrt(n_trips-lb+1)*0.5)
}
m1The map shows connections of station coordinates (directed trips) that appear at least 25 times. Size of lines increases in number of trips. Thus connections can be identified that are used more oftne. Right now, direction of trips (lon1,lat1) -> (lon2, lat2) and (lon2,lat2) -> (lon1, lat1) is distinguished i.e. some stations are connection by (duplicated) lines. Removing those duplicates is postponed. This is essential to identify the line that connects two station and appears most in data. To get a first result the map above can be used.